# -*- coding:UTF-8 -*-
"""
@Project:   DataCrawler
@FileName:  main.py 
@CreateDate:2023/4/22 23:03  
@Author:    Jia  
@Desc:      爬虫调度器

-------------------------------------------------------
数据结构：
redis 集合              redis 哈希
-------------------------------------------------------
is_crawled  已爬        first_floor_data  compy:url
no_crawled  未爬
new_urls    新URL
fail_urls   失败URL
--------------------------------------------------------
"""

from Common.url_manager import UrlManager
from Common.downloader import HtmlDownloader
from Common.html_parser import HtmlParser
from Common.output import Output
import traceback
from Config.log import Logs
from database import redis_db
import asyncio
import time
# NOTE: the original file had module-level `global new_url` / `global count`
# statements here; at module scope `global` is a no-op (it neither creates
# nor declares anything), so they have been removed.
logger = Logs().debug_logger()
rds = redis_db.redis_conn2(logger)


class SpiderMain:
    """Crawler scheduler: wires the URL manager, async downloader, parser
    and exporter together and drives the crawl loop."""

    def __init__(self):
        # Wall-clock start, used to report total run time at the end.
        self.start_time = time.perf_counter()
        self.urls = UrlManager(rds)
        self.downloader = HtmlDownloader(logger)
        self.parser = HtmlParser(rds)
        self.export = Output(rds)

    async def crawling(self, url: str, max_pages: int = 100):
        """
        Crawl starting from an initial URL.

        :param url: the seed URL to start from
        :param max_pages: stop after this many pages (defaults to 100,
                          matching the original hard-coded limit)
        """
        # Local state instead of the original module-level globals:
        # `current_url` tracks the URL in flight so the failure handler can
        # record it; it is None until the first URL is fetched, which avoids
        # the NameError the original code hit if an error occurred before
        # the first get_url() completed.
        count = 0
        current_url = None

        # Seed the not-yet-crawled set with the initial URL.
        self.urls.add_new_url(url)
        try:
            # Keep going while there are URLs left to crawl.
            while self.urls.has_no_crawled():
                # Take one URL from the pending set.
                current_url = await self.urls.get_url()

                # Download the page with the async downloader.
                html_content = await self.downloader.async_download(current_url)

                # Parse the page: emits extracted data and queues new URLs.
                await self.parser.parser(html_content)

                # NOTE: the original loop then repeated the same three steps
                # via create_task()/gather(); since each task was awaited
                # before the next was created, no concurrency was gained and
                # gather() ran on already-finished tasks, so that duplicated
                # block has been removed.

                count += 1
                if count >= max_pages:
                    break

        except KeyboardInterrupt:
            logger.debug('键盘手动中断')
        except Exception:
            logger.info(f'crawling fail:{traceback.format_exc()}')
            # Only record a failed URL if we actually took one.
            if current_url is not None:
                self.urls.add_fail_urls(current_url)
        else:
            # Export all crawled data as HTML.
            self.export.export_html()

            # Report total elapsed time.
            end_time = time.perf_counter()
            use_time = end_time - self.start_time
            logger.info(f'本次运行共用时{round(use_time, 2)}秒')


if __name__ == '__main__':
    # Seed URL is intentionally left empty here — fill in a real start URL
    # before running.
    u = ''
    app = SpiderMain().crawling(u)
    asyncio.run(app)
    # To avoid the "443" error on Windows, the selector event-loop policy
    # below must be set BEFORE asyncio.run() is called — as written (after
    # the run and commented out) it has no effect.
    # asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
